Looking at our data sets, I compare performance by level of difficulty. I also try to extract buzz words and prominent topics at each level looking at frequency of words in the title, description, and metadata. Surprisingly, intermediate level R videos perform the best.
library(readr)
library(ggplot2)
library(runner)
library(stringr)
library(tidytext)
library("stopwords")
library("wordcloud")
library(ggrepel)
library(psych)
directory <- "data/"
rUniqueFilePath <- paste(directory, "R Programming Unique.csv", sep = "")
rVideosUnique <- read_delim(rUniqueFilePath, trim_ws = TRUE)
rVideosUniqueRelevant <- rVideosUnique %>%
filter(relevant == TRUE)
# "Misinformation"
# 66/92 = 71%
beginnerVideosUniqueRelevant <- rVideosUnique %>%
filter(level == "beginner")
# 71/95 = 75%
intermediateVideosUniqueRelevant <- rVideosUnique %>%
filter(level == "intermediate")
# 90/91 = 99%
advancedVideosUniqueRelevant <- rVideosUnique %>%
filter(level == "advanced")
# When did the market saturate?
ggplot(beginnerVideosUniqueRelevant, aes(publishedAt, viewCount)) +
ggtitle("Beginner Relevancy by Date") +
geom_point(aes(colour = factor(relevant)))

ggplot(intermediateVideosUniqueRelevant, aes(publishedAt, viewCount)) +
ggtitle("Intermediate Relevancy by Date") +
geom_point(aes(colour = factor(relevant)))

ggplot(advancedVideosUniqueRelevant, aes(publishedAt, viewCount)) +
ggtitle("Advanced Relevancy by Date") +
geom_point(aes(colour = factor(relevant)))

# Assuming most of a video's traffic occurs within the first couple of months, then viewership has increased at a steady rate
# Amount of older videos can also speak to scarcity of newer videos as sorted by relevancy, one might expect new videos to be more relevant
ggplot(rVideosUniqueRelevant %>%
arrange(publishedAt), aes(publishedAt, sum_run(
x = viewCount,
idx = publishedAt
))
) +
ggtitle("Cumulative Views by Date") +
geom_smooth(method = lm) +
geom_point()

ggplot(rVideosUniqueRelevant, aes(publishedAt, viewCount)) +
ggtitle("Views by Date") +
geom_point(aes(colour = factor(level)))

ggplot(rVideosUniqueRelevant %>%
filter(viewCount < 1000), aes(publishedAt, viewCount)) +
ggtitle("Views by Date") +
geom_point(aes(colour = factor(level)))

ggplot(rVideosUniqueRelevant, aes(rVersion, viewCount)) +
geom_point(aes(colour = factor(level))) +
ggtitle("Views by R version") +
theme(axis.text.x = element_text(angle = 45))

rVideosUniqueRelevant$viewCount %>%
describe
## vars n mean sd median trimmed mad min max range skew
## X1 1 227 36420.6 154786.2 196 5635.71 268.35 0 1814387 1814387 8.38
## kurtosis se
## X1 83.93 10273.52
rVideosUniqueRelevant %>%
summary
## video_id publishedAt channelId
## Length:227 Min. :2010-02-07 00:00:00 Length:227
## Class :character 1st Qu.:2019-06-01 00:00:00 Class :character
## Mode :character Median :2020-09-17 00:00:00 Mode :character
## Mean :2019-08-22 20:05:17
## 3rd Qu.:2021-01-22 00:00:00
## Max. :2021-10-30 00:00:00
## title description thumbnails.default.url
## Length:227 Length:227 Length:227
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## channelTitle viewCount likeCount dislikeCount
## Length:227 Min. : 0 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 37 1st Qu.: 1.0 1st Qu.: 0.00
## Mode :character Median : 196 Median : 4.0 Median : 0.00
## Mean : 36421 Mean : 649.5 Mean : 10.65
## 3rd Qu.: 4324 3rd Qu.: 64.5 3rd Qu.: 1.50
## Max. :1814387 Max. :41152.0 Max. :513.00
## favoriteCount commentCount tags level
## Min. :0 Min. : 0.00 Length:227 Length:227
## 1st Qu.:0 1st Qu.: 0.00 Class :character Class :character
## Median :0 Median : 0.00 Mode :character Mode :character
## Mean :0 Mean : 34.89
## 3rd Qu.:0 3rd Qu.: 4.50
## Max. :0 Max. :886.00
## rVersion relevant
## Length:227 Mode:logical
## Class :character TRUE:227
## Mode :character
##
##
##
beginnerVideosUniqueRelevant$viewCount %>%
describe
## vars n mean sd median trimmed mad min max range skew
## X1 1 92 465864.9 1874922 2635.5 101214.3 3905.17 0 13934381 13934381 6.09
## kurtosis se
## X1 37.76 195474.1
beginnerVideosUniqueRelevant %>%
summary
## video_id publishedAt channelId
## Length:92 Min. :2013-10-31 00:00:00 Length:92
## Class :character 1st Qu.:2019-02-21 12:00:00 Class :character
## Mode :character Median :2020-04-16 12:00:00 Mode :character
## Mean :2019-09-13 10:26:05
## 3rd Qu.:2020-09-22 06:00:00
## Max. :2021-10-30 00:00:00
## title description thumbnails.default.url
## Length:92 Length:92 Length:92
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## channelTitle viewCount likeCount dislikeCount
## Length:92 Min. : 0 Min. : 0.0 Min. : 0.00
## Class :character 1st Qu.: 88 1st Qu.: 1.0 1st Qu.: 0.00
## Mode :character Median : 2636 Median : 45.5 Median : 1.00
## Mean : 465865 Mean : 10781.2 Mean : 186.73
## 3rd Qu.: 163410 3rd Qu.: 3679.8 3rd Qu.: 62.25
## Max. :13934381 Max. :336767.0 Max. :4412.00
## favoriteCount commentCount tags level
## Min. :0 Min. : 0.0 Length:92 Length:92
## 1st Qu.:0 1st Qu.: 0.0 Class :character Class :character
## Median :0 Median : 3.5 Mode :character Mode :character
## Mean :0 Mean : 457.0
## 3rd Qu.:0 3rd Qu.: 222.2
## Max. :0 Max. :15001.0
## rVersion relevant
## Length:92 Mode :logical
## Class :character FALSE:25
## Mode :character TRUE :66
## NA's :1
##
##
intermediateVideosUniqueRelevant$viewCount %>%
describe
## vars n mean sd median trimmed mad min max range skew
## X1 1 95 220691.8 739024 619 35749.96 911.8 0 4617704 4617704 4.41
## kurtosis se
## X1 20.27 75822.26
intermediateVideosUniqueRelevant %>%
summary
## video_id publishedAt channelId
## Length:95 Min. :2010-02-07 00:00:00 Length:95
## Class :character 1st Qu.:2017-11-13 12:00:00 Class :character
## Mode :character Median :2020-05-05 00:00:00 Mode :character
## Mean :2018-09-25 20:27:47
## 3rd Qu.:2021-01-09 12:00:00
## Max. :2021-12-02 00:00:00
## title description thumbnails.default.url
## Length:95 Length:95 Length:95
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## channelTitle viewCount likeCount dislikeCount
## Length:95 Min. : 0 Min. : 0 Min. : 0.00
## Class :character 1st Qu.: 62 1st Qu.: 1 1st Qu.: 0.00
## Mode :character Median : 619 Median : 6 Median : 0.00
## Mean : 220692 Mean : 3954 Mean : 71.79
## 3rd Qu.: 75637 3rd Qu.: 567 3rd Qu.: 6.50
## Max. :4617704 Max. :87535 Max. :1679.00
## favoriteCount commentCount tags level
## Min. :0 Min. : 0.0 Length:95 Length:95
## 1st Qu.:0 1st Qu.: 0.0 Class :character Class :character
## Median :0 Median : 0.0 Mode :character Mode :character
## Mean :0 Mean : 108.4
## 3rd Qu.:0 3rd Qu.: 52.5
## Max. :0 Max. :2227.0
## rVersion relevant
## Length:95 Mode :logical
## Class :character FALSE:19
## Mode :character TRUE :71
## NA's :5
##
##
advancedVideosUniqueRelevant$viewCount %>%
describe
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 91 4823.18 13816.58 89 1321.68 118.61 0 86035 86035 4.1 17.89
## se
## X1 1448.37
advancedVideosUniqueRelevant %>%
summary
## video_id publishedAt channelId
## Length:91 Min. :2015-11-25 00:00:00 Length:91
## Class :character 1st Qu.:2020-07-12 00:00:00 Class :character
## Mode :character Median :2020-11-05 00:00:00 Mode :character
## Mean :2020-06-26 00:15:49
## 3rd Qu.:2021-03-07 00:00:00
## Max. :2021-10-28 00:00:00
## title description thumbnails.default.url
## Length:91 Length:91 Length:91
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## channelTitle viewCount likeCount dislikeCount
## Length:91 Min. : 0 Min. : 0.00 Min. : 0.000
## Class :character 1st Qu.: 31 1st Qu.: 0.00 1st Qu.: 0.000
## Mode :character Median : 89 Median : 2.00 Median : 0.000
## Mean : 4823 Mean : 92.35 Mean : 1.604
## 3rd Qu.: 1263 3rd Qu.: 17.50 3rd Qu.: 0.000
## Max. :86035 Max. :1426.00 Max. :22.000
## favoriteCount commentCount tags level
## Min. :0 Min. : 0.000 Length:91 Length:91
## 1st Qu.:0 1st Qu.: 0.000 Class :character Class :character
## Median :0 Median : 0.000 Mode :character Mode :character
## Mean :0 Mean : 6.681
## 3rd Qu.:0 3rd Qu.: 2.000
## Max. :0 Max. :188.000
## rVersion relevant
## Length:91 Mode :logical
## Class :character FALSE:1
## Mode :character TRUE :90
##
##
##
ggplot(rVideosUniqueRelevant, aes(level, viewCount)) +
ggtitle("Views by Difficulty") +
geom_boxplot()

ggplot(rVideosUniqueRelevant, aes(level, likeCount)) +
ggtitle("Likes by Difficulty") +
geom_boxplot()

ggplot(rVideosUniqueRelevant, aes(level, dislikeCount)) +
ggtitle("Dislikes by Difficulty") +
geom_boxplot()

ggplot(rVideosUniqueRelevant, aes(level, commentCount)) +
ggtitle("Comments by Difficulty") +
geom_boxplot()

rVideosUniqueRelevantChannels <- rVideosUniqueRelevant %>%
group_by(channelTitle) %>%
summarise(n = n(), viewCount = sum(viewCount))
ggplot(rVideosUniqueRelevantChannels, aes(n, viewCount)) +
geom_point() +
ggtitle("All Channel Videos Count by Views") +
geom_label_repel(aes(label = channelTitle),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50')

beginnerVideosUniqueRelevantChannels <- beginnerVideosUniqueRelevant %>%
group_by(channelTitle) %>%
summarise(n = n(), viewCount = sum(viewCount))
ggplot(beginnerVideosUniqueRelevantChannels, aes(n, viewCount)) +
geom_point() +
ggtitle("Beginner Channel Video Count by Views") +
geom_label_repel(aes(label = channelTitle),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50')

intermediateVideosUniqueRelevantChannels <- intermediateVideosUniqueRelevant %>%
group_by(channelTitle) %>%
summarise(n = n(), viewCount = sum(viewCount))
ggplot(intermediateVideosUniqueRelevantChannels, aes(n, viewCount)) +
geom_point() +
ggtitle("Intermediate Channel Video Count by Views") +
geom_label_repel(aes(label = channelTitle),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50')

advancedVideosUniqueRelevantChannels <- advancedVideosUniqueRelevant %>%
group_by(channelTitle) %>%
summarise(n = n(), viewCount = sum(viewCount))
ggplot(advancedVideosUniqueRelevantChannels, aes(n, viewCount)) +
geom_point() +
ggtitle("Advanced Channel Video Count by Views") +
geom_label_repel(aes(label = channelTitle),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50')

maxWords = 100
filterRegex <- ".*[0-9][0-9]+$|^[0-9]*$|.*\\..*"
titleWords <- rVideosUniqueRelevant[,4] %>%
unnest_tokens(output = word, input = title) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- beginnerVideosUniqueRelevant[,4] %>%
unnest_tokens(output = word, input = title) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- intermediateVideosUniqueRelevant[,4] %>%
unnest_tokens(output = word, input = title) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

titleWords <- advancedVideosUniqueRelevant[,4] %>%
unnest_tokens(output = word, input = title) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = titleWords$word, freq = titleWords$n, max.words = maxWords)

descriptionWords <- rVideosUniqueRelevant[,5] %>%
unnest_tokens(output = word, input = description) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- beginnerVideosUniqueRelevant[,5] %>%
unnest_tokens(output = word, input = description) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- intermediateVideosUniqueRelevant[,5] %>%
unnest_tokens(output = word, input = description) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

descriptionWords <- advancedVideosUniqueRelevant[,5] %>%
unnest_tokens(output = word, input = description) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = descriptionWords$word, freq = descriptionWords$n, max.words = maxWords)

rVideosUniqueRelevant$tags <- gsub(",", " ", rVideosUniqueRelevant$tags)
tagsWords <- rVideosUniqueRelevant[,13] %>%
unnest_tokens(output = word, input = tags) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

beginnerVideosUniqueRelevant$tags <- gsub(",", " ", beginnerVideosUniqueRelevant$tags)
tagsWords <- beginnerVideosUniqueRelevant[,13] %>%
unnest_tokens(output = word, input = tags) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

intermediateVideosUniqueRelevant$tags <- gsub(",", " ", intermediateVideosUniqueRelevant$tags)
tagsWords <- intermediateVideosUniqueRelevant[,13] %>%
unnest_tokens(output = word, input = tags) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)

advancedVideosUniqueRelevant$tags <- gsub(",", " ", advancedVideosUniqueRelevant$tags)
tagsWords <- advancedVideosUniqueRelevant[,13] %>%
unnest_tokens(output = word, input = tags) %>%
anti_join(get_stopwords()) %>%
filter(!str_detect(word, regex(filterRegex))) %>%
count(word)
wordcloud(words = tagsWords$word, freq = tagsWords$n, max.words = maxWords)
